0.1 Variable descriptions

0.2 Read in datasets

Language codes (from WALS) – WALS, ISO, ascii-name. These are used to merge across datasets.

codes = read.csv("../data/language_codes.csv") %>%
  select(language, WALS, ISO) %>%
  mutate(ISO = unlist(lapply(strsplit(as.character(ISO),
                                      ","),function(x) x[1])))
# in cases where there are two ISO codes, takes first

Lupyan & Dale (2010): Demographic variables and syntactic compelxity

ld = read.table("../data/lupyan_2010.txt", fill = T, 
               header = T, sep = "\t", na.strings = "*") %>%
    left_join(codes, c("walsCode" = "WALS")) %>%
    rename(pop_log = logpop2, 
           mean.temp = aveTemp,
           n.neighbors = numNeighbors,
           sum.precip = sumPrecip,
           sd.temp = sdTemp,
           sd.precip = sdPrecip,
           lang.family = langFamily,
           lang.genus = langGenus,
           native.country = nativeCountry,
           native.country.area = nativeCountryArea,
           growing.season = growingSeason)

# get means across language
ld.demo = ld %>%
     group_by(ISO) %>%
     summarise_each(funs(mean(., na.rm = TRUE)), 
                    c(8:9, 16:121))  %>%
     select(1:3, 93:95, 100:101, 103:105, 108)

# add in data family
fams = ld %>%
      group_by(ISO) %>%
      filter(row_number() == 1) %>%
      select(ISO, lang.family, lang.genus, native.country,
             native.country.area, language)  

ld.demo = left_join(fams, ld.demo, by = "ISO") %>%ungroup()

WALS features of complexity

# get variables to include 
qual = ld %>%
  select(18:103, 124) %>%
  mutate_each(funs(as.factor)) %>%
  group_by(ISO) %>%
  summarise_each(funs(most.frequent.level)) %>%
  mutate_each(funs(as.factor)) 

ld_feature_names = read.csv("../data/lupyan_2010_all_feature_mappings.csv") 
qualVarNames = intersect(names(qual), ld_feature_names$WALS.feature.name)
qual = select(qual,which(is.element(names(qual), c("ISO", qualVarNames))))

# remap factor levels to complexity values (0,1)
# note two variables that are reported in the paper (NICPOC[59] and CYSIND[39] are missing in the data)

for (i in 1:length(qualVarNames)){
  thisVarLabs = ld_feature_names[ld_feature_names$WALS.feature.name == qualVarNames[i],]
  old = thisVarLabs$ld.level.label
  if (is.na(old)) {print("NA!!!")}
  new = thisVarLabs$ld.complexity
  col_i = grep(qualVarNames[i], colnames(qual))
  qual[,col_i] = mapvalues(as.matrix(qual[,col_i]), 
                           from = as.character(old),
                           to = as.character(new), warn_missing = TRUE)
}

ld.complexity = qual %>%
  mutate_each(funs(as.numeric), -ISO) %>%
  gather(variable, complexity.level, 2:28) %>%
  group_by(ISO) %>%
  summarise(morphological.complexity = sum(complexity.level))

ld.demo.qual = left_join(ld.demo, ld.complexity)

WALS features of complexity - Lupyan and Dale data

ld.c = read.table("../data/lupyan_2010_complexity.txt", fill = T, 
               header = T, sep = "\t", na.strings = "*") %>%
    mutate(lang = tolower(lang)) %>%
    left_join(codes, c("lang" = "language")) %>%
    rename(morphological.complexity.ld = complexity,
           morphological.complexity.ld.centered = complexityCentered) %>%
    select(morphological.complexity.ld, morphological.complexity.ld.centered, ISO)

Bentz, et al. (2015): L2 learners and lexical diversity Note here that we are only looking at languages from the UDHR corpus. The other corpora have smaller languages only.

bentz = read.csv("../data/bentz_2015.csv") %>%
    gather(temp, LDT, starts_with("LDT")) %>% 
    unite(temp1, measure, temp, sep = ".") %>% 
    spread(temp1, LDT) %>%
    filter(text == "UDHR") %>%
    select(iso_639_3, RatioL2, a.LDTscaled, 
           H.LDTscaled, TTR.LDTscaled, Stock,
           Region, L1_speakers, L2_speakers) %>%
    rename(ISO = iso_639_3,
           n.L1.speakers = L1_speakers,
           n.L2.speakers = L2_speakers,
           ratio.L2.L1 = RatioL2,
           stock = Stock,
           region = Region,
           scaled.LDT.TTR = TTR.LDTscaled,
           scaled.LDT.ZM = a.LDTscaled,
           scaled.LDT.H = H.LDTscaled) %>%
    left_join(codes, by="ISO") %>%
    mutate(ISO = as.factor(ISO)) %>%
    select(-language, -WALS) 

# these were taken directly from ethnologue website by ML for missing languages that were high-frequency across the datasets
ethno_sup = read.csv("../data/supplementary_enthnologue.csv")
bentz = full_join(bentz, ethno_sup)

Atkinson (2011)

atkinson = read.csv("../data/atkinson_2011.csv") %>%
  select(normalized.vowel.diversity,
         normalized.consonant.diversity,    
         normalized.tone.diversity,
         normalized.phoneme.diversity, ISO, 
         distance.from.origin) %>%
  mutate(ISO = unlist(lapply(strsplit(as.character(ISO),
                                      " "),function(x) x[1]))) %>%
  left_join(codes, by="ISO") %>%
  select(-language, -WALS)

Lewis & Frank (under review): Complexity Bias

# complexity bias
cb = read.csv("../data/lewis_2015.csv") %>%
      rename(complexity.bias = corr,
              p.complexity.bias = p.corr,
              mono.complexity.bias = mono.cor,
              open.complexity.bias = open.cor) %>%
      left_join(codes, by="language") %>%
    select(-X.1, -X, -lower.ci, -upper.ci, -checked,
           -mean.length) %>%
    distinct(ISO) %>%
   filter(language != "english") %>% # english is an outlier in this dataset because norms were colelcted in english
  select(-language, -WALS)

Futrell, Mahowald, & Gibson (2015): Dependency length

dl = read.csv("../data/futrell_2015.csv") %>%
  left_join(codes, by="language") %>%
  select(-language, -WALS, -fixed.random.baseline.slope, 
         -observed.slope, -m_rand) %>%
  rename(mean.dependency.length = m_obs)

Pellegrino, Coupe, & Marsico (2015): Information density

uid = read.csv("../data/pellegrino_2015.csv")  %>%
  left_join(codes, by="language") %>%
  select(-language, -WALS)

Moran, McCloy, & Wright (2012): Phonemic inventory

phoneme = read.csv("../data/moran_2012.csv") %>%
          select(ISO, pho, con, vow, son, 
                 obs, mon, qua, ton) %>%
          rename(n.phonemes = pho,
                 n.consonants = con,
                 n.vowels = vow,
                 n.sonorants = son,
                 n.obstruents = obs,
                 n.monophthongs = mon,
                 n.qual.monophthongs = qua,
                 n.tones = ton)

Wichmann, et al. (2013): Mean word length

# note is this IPA? - fix this!
ml = read.csv("../data/wichmann_2013.csv") %>% 
  select(1,1:109) %>%
  gather(word,translation,I:name) %>%
  mutate(nchar = unlist(
    lapply(
      lapply(
        strsplit(
          gsub("[[:space:]]", "", translation) ,
          ","), 
           nchar), mean))) %>%
  filter(translation != "")

# subset to only those words in the swadesh list (n = 40)
swadesh.words  = ml[ml$ISO == "gwj", "word"] 
ml = ml %>%
      filter(is.element(word, swadesh.words))

ml.d = ml %>%
        group_by(ISO) %>%
        summarize(mean.length = mean(nchar, na.rm = T)) 

Luniewska, et al. (2015): Age of acquistion (Aoa)

aoa.raw = read.csv("../data/luniewska_2015.csv", 
                       header = T, fileEncoding = "latin1")
  
aoa = aoa.raw %>%
  gather(language_aoa, aoa, grep("aoa", 
                                 names(aoa.raw))) %>%
  mutate(language_aoa = tolower(unlist(lapply(strsplit(
    as.character(language_aoa),"_"), function(x) x[1])))) %>%
  select(-3:-27) %>%
  rename(language = language_aoa) %>%
  left_join(codes, by="language") %>%
  mutate(language = as.factor(language)) %>%
  group_by(ISO) %>%
  summarize(mean.aoa = mean(aoa, na.rm = T))

Merge data frames together by ISO

d = full_join(ld.demo.qual, cb, by = "ISO") %>%
    full_join(ld.c, by = "ISO") %>%
    full_join(bentz, by = "ISO") %>%
    full_join(dl, by = "ISO") %>%
    full_join(uid, by = "ISO") %>%
    full_join(phoneme, by = "ISO") %>%
    full_join(ml.d, by = "ISO") %>%
    full_join(aoa, by = "ISO") %>%
    full_join(atkinson, by = "ISO") %>%
    distinct(ISO) %>%
    filter(!is.na(ISO), ISO !="") %>%
    mutate(ISO = as.factor(ISO))

#d$pop_log= ifelse(is.na(d$pop_log), log(d$n.L1.speakers + d$n.L2.speakers), d$pop_log) # uses bentz values where missing from WALS
#d$ratio.L2.L1= ifelse(is.na(d$ratio.L2.L1), d$n.L2.speakers/d$n.L1.speakers, d$ratio.L2.L1) 

#write.csv(d, "../data/langLearnVar_data.csv")

load data

d = read.csv("../data/langLearnVar_data.csv")[,-1]

0.3 Data cleaning

Remove outliers and check for normality

d.clean = d %>%
  mutate_each(funs(removeOutlier(.,N_EXCLUDE_SDS)), c(-ISO, -lang.family, 
              -native.country, -native.country.area, 
         -lang.genus, -language, -stock, -region))

d.clean %>%
  select(-ISO, -lang.family, -native.country, -native.country.area, 
         -lang.genus, -language, -stock, -region) %>%
  gather(var, value) %>%
  ggplot(aes(sample = value)) + 
  stat_qq(na.rm = T, size = .2) + 
  facet_wrap( ~ var, scales = "free") +
  theme_bw()

The following variables look right-skewed.

NN_vars = c("area", "perimeter", "n.neighbors", "sd.precip",
           "ratio.L2.L1", "n.L1.speakers", "n.L2.speakers",
           "n.phonemes", "n.consonants","n.vowels", "n.sonorants",
           "n.obstruents", "n.monophthongs", "n.qual.monophthongs",
           "n.tones")

Take the log of these variables, and check for normality again.

d.clean = d.clean %>%
  mutate_each(funs(log = log(.), dummy =  sum(.)),
              one_of(NN_vars)) %>%
  select(-contains("dummy")) %>%
  select(-one_of(NN_vars))

d.clean %>%
  select(-ISO, -lang.family, -native.country, -native.country.area, 
         -lang.genus, -language, -stock, -region) %>%
  gather(var, value) %>%
  ggplot(aes(sample = value)) + 
  stat_qq(na.rm = T, size = .2) + 
  facet_wrap( ~ var, scales = "free") +
  theme_bw()

Check for normality of variables using Shapiro-Wilk Normality Test

is.na(d.clean) <- sapply(d.clean, is.infinite)

normality.test = d.clean %>%
  summarise_each(funs(unlist(tidy(shapiro.test(.))[2])),
                 c(-ISO, -lang.family, -native.country,
                   -native.country.area,
                     -lang.genus, -language, -stock, -region)) %>%
  gather(variable, shapiro.p.value) %>%
  mutate(normal = ifelse(shapiro.p.value > .05, TRUE, FALSE)) %>%
  summary()

By this test, only 11 variables normal. But, I think this is the best we can do (?).

Look at histograms of all variables.

demo_vars = c("lat", "lon", "perimeter_log", "n.neighbors_log",
              "mean.temp", "sum.precip",
              "sd.temp", "growing.season", "pop_log", "area_log",
              "n.neighbors_log", "sd.precip_log", "ratio.L2.L1_log",
              "n.L1.speakers_log",
              "n.L2.speakers_log", "distance.from.origin")

lang_vars = c(setdiff(names(d.clean), demo_vars))[-c(1:6, 22,23)]

d.clean %>%
  gather(variable, num, lat:length(d.clean))  %>%
  filter(variable != "stock", variable != "region") %>%
  mutate(var_type = ifelse(is.element(variable, demo_vars),
                           "demo", "lang"),
         num = as.numeric(as.character(num))) %>%
  arrange(var_type) %>% 
  mutate(variable = factor(variable, variable)) %>%
  ggplot(aes(x = num, fill = var_type)) + 
    geom_histogram(position = "identity") +
    facet_wrap( ~ variable, scales = "free") + 
    theme_bw()

Number of languages we have data, for each variable

d.clean %>%
  gather(variable, num, lat:length(d.clean))  %>%
  filter(variable != "stock", variable != "region") %>%
  group_by(variable) %>%
  summarize(counts = length(which(is.na(num) == F)))  %>%
  mutate(var_type = ifelse(is.element(variable, demo_vars),
                           "demo", "lang")) %>%
  mutate(counts_trunc = ifelse(counts > 150, 150, counts))  %>%
  arrange(var_type) %>% 
  mutate(variable = factor(variable, variable)) %>%
  ggplot(aes(y=counts_trunc, x = variable, fill = var_type)) + 
    geom_bar(stat = "identity") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.position = "none") +
    ylab("Number of languages (truncated at 150)") +
    ggtitle("Number languages per measure")

Look at number of languages intersecting each variable

arealControl_vars = c("stock", "region", "lang.family", "lang.genus",
                      "native.country", "native.country.area")

d.clean %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), c(lang_vars, arealControl_vars)))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars))) %>%
  group_by(lang_measure, pop_measure) %>%
  mutate(both_not_na = !is.na(lang_value) & !is.na(pop_value)) %>%
  summarise(n_languages = length(which(both_not_na == TRUE))) %>%
  ggplot(aes(pop_measure, lang_measure, group = lang_measure)) +
      geom_tile(aes(fill = n_languages)) + 
      geom_text(aes(fill = n_languages, label = n_languages)) +
      scale_fill_gradient(low = "white", high = "red")  +
      xlab("population variables") + 
      ylab("language variables") + 
      ggtitle("Number languages between variables") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))

0.4 Relationship between language and demographic variables

Fits controling for family – controlling for family in a mixed effect model

# these are the variables we care about for the paper
demo_vars_crit = c("n.neighbors_log",
              "mean.temp", "sum.precip", "sd.temp",
              "pop_log", "area_log", "n.neighbors_log",
              "sd.precip_log", "ratio.L2.L1_log",
              "distance.from.origin")

lang_vars_crit = c("p.complexity.bias","scaled.LDT.TTR",
                   "mean.dependency.length", "mean.length",
                   "mean.aoa", "n.consonants_log", "n.vowels_log",
                   "morphological.complexity", "morphological.complexity.ld")

d.scatter.fam = d.clean %>% 
  select(ratio.L2.L1_log, pop_log, distance.from.origin,
         n.neighbors_log, area_log, mean.temp, sd.temp, sum.precip,
         sd.precip_log, n.consonants_log, n.vowels_log, mean.length,
         p.complexity.bias, scaled.LDT.TTR, morphological.complexity, morphological.complexity.ld, 
         mean.aoa, lang.family, native.country) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) +
                 (1|native.country), data=.))) %>%
  filter(term == "pop_value") %>%
  mutate(sig = ifelse(abs(statistic) > 1.96, "*", "")) %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

d.model.fits[d.model.fits == "NaN"] = "NA"

# plot
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = d.model.fits, aes(fill = sig.col), 
          xmin = -Inf, xmax = Inf,
          ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_smooth(method = "lm", color = "green") +
  facet_grid(lang_measure ~ pop_measure, scales = "free") +
  scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

0.5 Relationship between language and demographic variables, with PCA

Look at correlations between PCA variables

# get demographic variables only (excluding L2 because intersections contains only 40 languages)
d.demo =  d.clean %>%
  select(lat, mean.temp, sd.temp, sum.precip, sd.precip_log, 
         area_log, pop_log, n.neighbors_log) %>% 
  mutate(lat = abs(lat))

is.na(d.demo) <- sapply(d.demo, is.infinite) 

demo.corr = cor(d.demo, use = "pairwise.complete")
abs(demo.corr)>.7
##                   lat mean.temp sd.temp sum.precip sd.precip_log area_log
## lat              TRUE      TRUE   FALSE      FALSE          TRUE    FALSE
## mean.temp        TRUE      TRUE   FALSE      FALSE         FALSE    FALSE
## sd.temp         FALSE     FALSE    TRUE      FALSE         FALSE    FALSE
## sum.precip      FALSE     FALSE   FALSE       TRUE         FALSE    FALSE
## sd.precip_log    TRUE     FALSE   FALSE      FALSE          TRUE    FALSE
## area_log        FALSE     FALSE   FALSE      FALSE         FALSE     TRUE
## pop_log         FALSE     FALSE   FALSE      FALSE         FALSE    FALSE
## n.neighbors_log FALSE     FALSE   FALSE      FALSE         FALSE    FALSE
##                 pop_log n.neighbors_log
## lat               FALSE           FALSE
## mean.temp         FALSE           FALSE
## sd.temp           FALSE           FALSE
## sum.precip        FALSE           FALSE
## sd.precip_log     FALSE           FALSE
## area_log          FALSE           FALSE
## pop_log            TRUE           FALSE
## n.neighbors_log   FALSE            TRUE
#exclude sd.precip_log (correlated with lat and mean temp) and lat (correlated with mean.temp) [threshold: > .7]
d.demo =  d.clean %>%
  select(mean.temp, sd.temp, sum.precip, 
         area_log, pop_log, n.neighbors_log) 

# do pca
pca.demo = prcomp(na.omit(d.demo), scale = TRUE)
barplot(pca.demo$sdev/pca.demo$sdev[1])

pca.demo = prcomp(na.omit(d.demo), scale = TRUE, tol = .6)

# look at variance explained
fviz_eig(pca.demo, addlabels = TRUE, 
               linecolor ="red", geom = "line") +
  theme_bw()

# plot loadings
pcs.demo <- cbind(names(d.demo),
                    gather(as.data.frame(pca.demo$rotation), 
                           pc, value))
names(pcs.demo)[1] = "variable"
 
pcs.demo$variable =  factor(pcs.demo$variable , levels = pcs.demo$variable[1:8]) 

# order variables
pcs.demo$magnitude = ifelse(abs(pcs.demo$value)>.2, TRUE, FALSE)
ggplot(pcs.demo) +
  geom_bar(aes(x = variable, y = value,
               fill = magnitude), stat="identity") +
  facet_wrap(~pc) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))  + 
       scale_fill_manual(name = "Magnitude", 
                       values = c("grey","red1")) +
  ggtitle("PCA loadings")

# merge PCAs with full dataset for model fits
d.demo[!is.na(d.demo)] = 0
d.demo[is.na(d.demo)] = 1
good_is = which(rowSums(d.demo) == 0)
d.pca.demo = d.clean[good_is,] %>% cbind(pca.demo$x)

names(d.pca.demo)[names(d.pca.demo) == "PC1"] = "big_and_cold"
names(d.pca.demo)[names(d.pca.demo) == "PC2"] = "big_and_hot"

Principle components by language

ggplot(d.pca.demo) +   
  borders("world", colour="gray50", fill="gray50") +
  geom_point(aes(x = lon, y=lat, 
               color = big_and_cold), size=3) +
  scale_colour_gradient(high="green", low = "white") +
  ggtitle("Wet and hot (PC1)") + 
  mapTheme

ggplot(d.pca.demo) +   
  borders("world", colour="gray50", fill="gray50") +
  geom_point(aes(x = lon, y=lat, 
               color = big_and_hot), size=3) +
  scale_colour_gradient(high="blue", low = "white") +
  ggtitle("big (PC2)") + 
  mapTheme

Model fits using PCA

demo_vars_crit2 = c("big_and_cold", "big_and_hot")

lang_vars_crit2 = c("morphological.complexity",
                    "p.complexity.bias","scaled.LDT.TTR",
                    "mean.length", "mean.aoa",
                    "n.consonants_log", "n.vowels_log")

d.scatter.fam = d.pca.demo %>% 
  select(n.consonants_log, n.vowels_log, 
         mean.length, p.complexity.bias, scaled.LDT.TTR,
         morphological.complexity, mean.aoa,
         lang.family, native.country, big_and_cold, big_and_hot) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit2))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit2)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) + (1|native.country),
               data=.))) %>%
  filter(term == "pop_value") %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

d.model.fits[d.model.fits == "NaN"] = "NA"
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = d.model.fits, aes(fill = sig.col), 
        xmin = -Inf, xmax = Inf,
        ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_smooth(method = "lm", color = "green") +
  facet_grid(lang_measure~pop_measure, scales = "free") +
  scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

0.6 Relationship between PC on language variables

Look at correlations between PCA variables

d.lang =  d.clean %>%
  select(morphological.complexity, scaled.LDT.H, 
         mean.length, n.consonants_log, n.vowels_log) 
is.na(d.lang) <- sapply(d.lang, is.infinite) 

lang.corr = cor(d.lang, use = "pairwise.complete")
abs(lang.corr)>.7
##                          morphological.complexity scaled.LDT.H mean.length
## morphological.complexity                     TRUE        FALSE       FALSE
## scaled.LDT.H                                FALSE         TRUE       FALSE
## mean.length                                 FALSE        FALSE        TRUE
## n.consonants_log                            FALSE        FALSE       FALSE
## n.vowels_log                                FALSE        FALSE       FALSE
##                          n.consonants_log n.vowels_log
## morphological.complexity            FALSE        FALSE
## scaled.LDT.H                        FALSE        FALSE
## mean.length                         FALSE        FALSE
## n.consonants_log                     TRUE        FALSE
## n.vowels_log                        FALSE         TRUE
# include all
# do pca
pca.lang = prcomp(na.omit(d.lang), scale = TRUE)
barplot(pca.lang$sdev/pca.lang$sdev[1])

# look at varence explained
fviz_eig(pca.lang, addlabels=TRUE, 
               linecolor ="red", geom = "line") +
  theme_bw()

pca.lang = prcomp(na.omit(d.lang), scale = TRUE, tol = .6)


# plot loadings
pcs.lang <- cbind(names(d.lang),
                    gather(as.data.frame(pca.lang$rotation), 
                           pc, value))
names(pcs.lang)[1] = "variable"
 
pcs.lang$variable =  factor(pcs.lang$variable , levels = pcs.lang$variable[1:8]) # order variables
pcs.lang$magnitude = ifelse(abs(pcs.lang$value)>.2, TRUE, FALSE)
ggplot(pcs.lang) +
  geom_bar(aes(x = variable, y = value,
               fill = magnitude), stat="identity") +
  facet_wrap(~pc) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))  + 
       scale_fill_manual(name = "Magnitude", 
                       values = c("grey","red1")) +
  ggtitle("PCA loadings")

# merge PCAs with full dataset for model fits
d.lang[!is.na(d.lang)] = 0
d.lang[is.na(d.lang)] = 1
good_is = which(rowSums(d.lang) == 0)
d.pca.lang = d.clean[good_is,] %>% cbind(pca.lang$x)

names(d.pca.lang)[names(d.pca.lang) == "PC1"] = "PC1_L"
names(d.pca.lang)[names(d.pca.lang) == "PC2"] = "PC2_L"

Model fits using PCA - all demographic to predict lang principle components

lang_vars_crit2 = c("PC1_L", "PC2_L")

demo_vars_crit2 = c("n.neighbors_log",
              "mean.temp", "sum.precip", "sd.temp",
              "pop_log", "area_log", "n.neighbors_log",
              "sd.precip_log", "ratio.L2.L1_log",
              "distance.from.origin", "mean.aoa")

d.scatter.fam = d.pca.lang %>% 
  select(ratio.L2.L1_log, pop_log, distance.from.origin,
         n.neighbors_log, area_log, mean.temp, sd.temp, sum.precip, mean.aoa, PC1_L, PC2_L, lang.family, native.country) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit2))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit2)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value + (1|native.country), data=.))) %>%
  filter(term == "pop_value") %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

d.model.fits[d.model.fits == "NaN"] = "NA"
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = d.model.fits, aes(fill = sig.col), 
          xmin = -Inf, xmax = Inf,
          ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_smooth(method = "lm", color = "green") +
  facet_grid(lang_measure~pop_measure, scales = "free") +
  scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

All PCAs

all.pca = left_join(d.pca.demo, d.pca.lang[,c("PC1_L", "PC2_L", "ISO")],
                    by = "ISO")

demo_vars_crit2 = c("big_and_cold", "big_and_hot")
lang_vars_crit2 = c("PC1_L", "PC2_L")

d.scatter.fam = all.pca %>% 
  select(PC1_L, PC2_L, big_and_cold, big_and_hot, 
         lang.family, native.country, ISO) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit2))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit2)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) +
                 (1|native.country), data=.))) %>%
  filter(term == "pop_value") %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

as.data.frame(d.model.fits)
##   lang_measure  pop_measure      term    estimate std.error  statistic
## 1        PC1_L big_and_cold pop_value  0.07914391 0.1937002  0.4085897
## 2        PC1_L  big_and_hot pop_value -0.39135752 0.2090666 -1.8719273
## 3        PC2_L big_and_cold pop_value -0.13234139 0.2538150 -0.5214088
## 4        PC2_L  big_and_hot pop_value  0.20532594 0.1538443  1.3346343
##   group sig.col pop_value lang_value
## 1 fixed    none       0.1        0.1
## 2 fixed    none       0.1        0.1
## 3 fixed    none       0.1        0.1
## 4 fixed    none       0.1        0.1
d.model.fits[d.model.fits == "NaN"] = "NA"
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = d.model.fits, 
          xmin = -Inf, xmax = Inf,
          ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_smooth(method = "lm", color = "green") +
  facet_grid(lang_measure~pop_measure, scales = "free") +
  #scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

0.7 Replicate Lupyan and Dale (2010) complexity score plot (Fig. 3)

ld.demo.qual %>%
  mutate(morphological.complexity.factor =
           as.factor(morphological.complexity)) %>%
  group_by(morphological.complexity.factor) %>%
  multi_boot(column="pop_log",
              summary_groups = c("morphological.complexity.factor"),
              statistics_functions = c("mean", "ci_lower","ci_upper")) %>%
  mutate(morphological.complexity =  
           as.numeric(as.character(morphological.complexity.factor))) %>%
  ggplot() +
  geom_pointrange(aes(x= morphological.complexity.factor,
                      y = mean, ymin = ci_lower, ymax = ci_upper)) + 
  geom_smooth(method = "lm",aes(morphological.complexity-5, mean)) +
  # theres a weird offset because complexity is a factor?
  ylab("Log population") + 
  xlab("Morphological complexity") +
  ggtitle("Lupyan and Dale (2010) Fig. 3 reproduction") +
  theme_bw() 

0.8 Relationship between language variables for AOA

# predicted relationship with AOA  with TTR and number of consonants

tidy(lmer(mean.aoa ~ morphological.complexity + (morphological.complexity|lang.family) + (1|native.country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$morphological.complexity)

tidy(lmer(mean.aoa ~ p.complexity.bias + (p.complexity.bias|lang.family) + (1|native.country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$p.complexity.bias)

# **
tidy(lmer(mean.aoa ~ scaled.LDT.TTR + (scaled.LDT.TTR|lang.family) + (1|native.country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$scaled.LDT.TTR)
cor.test(d.clean$mean.aoa, d.clean$scaled.LDT.H)
cor.test(d.clean$mean.aoa, d.clean$scaled.LDT.ZM)

tidy(lmer(mean.aoa ~ mean.dependency.length + (mean.dependency.length|lang.family) + (1|native.country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$mean.dependency.length)

tidy(lmer(mean.aoa ~ mean.length + (mean.length|lang.family) + (1|native.country), d.clean)) [2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$mean.length)

# **
tidy(lmer(mean.aoa ~ n.consonants_log + (n.consonants_log|lang.family) + (1|native.country) , d.clean))[2, "statistic"]
tidy(lmer(mean.aoa ~ normalized.consonant.diversity  + (1|native.country) , d.clean)) [2, "statistic"]

tidy(lmer(mean.aoa ~ n.phonemes_log + (n.phonemes_log|lang.family) + (1|native.country) , d.clean))[2, "statistic"]

tidy(lmer(mean.aoa ~ n.vowels_log + (n.vowels_log|lang.family) + (1|native.country) , d.clean))[2, "statistic"]

tidy(lmer(mean.aoa ~ syllable.rate  + (1|native.country) , d.clean))[2, "statistic"]

tidy(lmer(mean.aoa ~ information.density  + (1|native.country) , d.clean))[2, "statistic"]

tidy(lmer(mean.aoa ~ information.rate  + (1|native.country) , d.clean)) [2, "statistic"]

ggplot(d.clean, aes(mean.length, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(scaled.LDT.TTR, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(n.consonants_log, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(n.phonemes_log, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(n.vowels_log, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(mean.dependency.length , mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")